Collect and Process /r/glassine Dataset

Load the relevant libraries:

library(RedditExtractoR)
library(stringr)
library(knitr)

Check whether the raw data is available in the working directory. If it isn’t, collect the data from the web.

# Load the cached /r/glassine thread listing if present; otherwise scrape it
# from Reddit and cache it, so the file.exists() check actually pays off on
# subsequent runs (the original never wrote the CSV back out).
if (file.exists("glassine_urls.csv")) {
  glassine_urls <- read.csv("glassine_urls.csv")
} else {
  glassine_urls <- reddit_urls(subreddit = "glassine", page_threshold = 40)
  # Persist the scrape so later runs don't hit the Reddit API again
  write.csv(glassine_urls, "glassine_urls.csv", row.names = FALSE)
}

# Load the reference table of US area codes. The source is an .xls that
# download.file() could not fetch reliably, so when the CSV is missing we
# emit an actionable message (the original used a bare sprintf(), whose
# return value is silently discarded in a script, and named a nonexistent
# file.download() function).
if (file.exists("area_codes_by_state.csv")) {
  area_codes_by_state <- read.csv("area_codes_by_state.csv")
} else {
  message("area_codes_by_state.csv not found -- download the .xls manually ",
          "from the link below and save it as area_codes_by_state.csv ",
          "(automated download.file() did not work)")
}

Click here for the area codes by state table if the data loading failed

Examine the glassine data to decide how to parse the content.

summary(glassine_urls)
##        date      num_comments          title        subreddit  
##  29-08-17: 11   Min.   : 0.000   (R) 412  : 20   glassine:998  
##  13-10-17:  9   1st Qu.: 1.000   (HAT) 412: 11                 
##  18-01-17:  9   Median : 3.000   (R)      :  5                 
##  19-01-17:  9   Mean   : 3.955   412      :  4                 
##  24-01-17:  8   3rd Qu.: 5.000   (D) 215  :  3                 
##  28-02-17:  8   Max.   :36.000   (R) ptown:  3                 
##  (Other) :944                    (Other)  :952                 
##                                                                                                   URL     
##  http://www.reddit.com/r/glassine/comments/5o6lm8/412/                                              :  1  
##  http://www.reddit.com/r/glassine/comments/5o6xwi/paterson_john_gottiblue_stamp_dope_dickblue_stamp/:  1  
##  http://www.reddit.com/r/glassine/comments/5o6zyo/deebos/                                           :  1  
##  http://www.reddit.com/r/glassine/comments/5o789k/tight_blue/                                       :  1  
##  http://www.reddit.com/r/glassine/comments/5o7f6z/412_blue_house_party/                             :  1  
##  http://www.reddit.com/r/glassine/comments/5o8daj/ptown_blue_koolaid/                               :  1  
##  (Other)                                                                                            :992
head(glassine_urls)
##       date num_comments
## 1 27-11-17            0
## 2 27-11-17            0
## 3 27-11-17            0
## 4 27-11-17            1
## 5 27-11-17            1
## 6 27-11-17            3
##                                                                                                                 title
## 1                                                                              (R) Birthday purple stamp with cupcake
## 2                                                                                                 (HAT) Tuna fish 973
## 3                                                                                                     (HAT) 412 Kenzo
## 4                                                                                           (Hat) trap God. In p town
## 5 (HAT) deep nod, ninja turtles green stamp, cocaine cowboys rainbow stamp, and skittles rainbow. All ptown east side
## 6                                                                                                           (HAT) 412
##   subreddit
## 1  glassine
## 2  glassine
## 3  glassine
## 4  glassine
## 5  glassine
## 6  glassine
##                                                                                                URL
## 1           http://www.reddit.com/r/glassine/comments/7fyih9/r_birthday_purple_stamp_with_cupcake/
## 2                              http://www.reddit.com/r/glassine/comments/7fxxtf/hat_tuna_fish_973/
## 3                                  http://www.reddit.com/r/glassine/comments/7fxv7q/hat_412_kenzo/
## 4                         http://www.reddit.com/r/glassine/comments/7fwyzu/hat_trap_god_in_p_town/
## 5 http://www.reddit.com/r/glassine/comments/7fwuc0/hat_deep_nod_ninja_turtles_green_stamp_cocaine/
## 6                                        http://www.reddit.com/r/glassine/comments/7fvk9b/hat_412/

The “title” column of the dataset mentions the area code for each thread. One possible parsing procedure could extract any three-digit number within the title column. This will produce an array of candidate area codes.

candidate_area_codes<-str_extract(glassine_urls$title, "[0-9]{3}")

Create frequency table for area code mentions:

# Tabulate how often each candidate code appears, most frequent first, and
# render the table with kable. The data-frame column inherits the name
# "candidate_area_codes" from the table's dimnames.
code_counts <- sort(table(candidate_area_codes), decreasing = TRUE)
ac_tbl <- as.data.frame(code_counts)
kable(ac_tbl, align = 'ccc')
candidate_area_codes Freq
412 263
973 205
215 52
609 38
724 13
100 4
201 3
908 3
315 2
357 2
570 2
007 1
173 1
267 1
413 1
480 1
540 1
585 1
631 1
708 1
738 1
757 1
777 1
845 1
862 1
864 1
872 1
978 1
983 1
Some of these candidates are not area codes; e.g. ‘007’. However, most candidates do have a corresponding area code. We can validate our candidates against the independent dataset from the area_codes_by_state data in the Area.code column.
# Cross-check each candidate against the reference list of genuine US area
# codes, then display the impostors (e.g. "007") that will be dropped.
is_real_code <- ac_tbl$candidate_area_codes %in% area_codes_by_state$Area.code
kable(ac_tbl[!is_real_code, ], align = 'ccc')
candidate_area_codes Freq
6 100 4
10 357 2
12 007 1
13 173 1
21 738 1
23 777 1
29 983 1
After removing the invalid candidates, the frequency table is ready to be used for constructing the choropleth maps for /r/glassine mentions.
# Keep only the rows whose candidate matches a genuine US area code, then
# show the cleaned frequency table.
valid_ac <- ac_tbl$candidate_area_codes %in% area_codes_by_state$Area.code
ac_tbl <- ac_tbl[valid_ac, ]
kable(ac_tbl, align = 'ccc')
candidate_area_codes Freq
1 412 263
2 973 205
3 215 52
4 609 38
5 724 13
7 201 3
8 908 3
9 315 2
11 570 2
14 267 1
15 413 1
16 480 1
17 540 1
18 585 1
19 631 1
20 708 1
22 757 1
24 845 1
25 862 1
26 864 1
27 872 1
28 978 1

Collect and process shapefile data

Choropleth maps shade different map regions according to a variable of interest. In this case, the shading will reflect the number of mentions a given area code has in the glassine dataset. But to shade a given map region we will have to first define its location. This involves finding data that delineates the extent of the area codes within the USA.

I’m getting ahead of myself a bit here, but I’m including one more processing step on the frequency table. This is because of a plotting issue I ran into later on with the maps. The essence of the problem is that these low-count area codes completely overlap with some high-count area codes, making it difficult to see the shading for the high-count area codes. There is probably a more elegant way of handling the problem. For now, here’s the hard-coded step:

ac_tbl<-ac_tbl[!(names(ac_tbl) %in% c("267","862"))]

Load map-making libraries:

# load up libraries:
library(maptools)
## Loading required package: sp
## Checking rgeos availability: TRUE
library(sp)
library(rgdal)
## rgdal: version: 1.2-16, (SVN revision 701)
##  Geospatial Data Abstraction Library extensions to R successfully loaded
##  Loaded GDAL runtime: GDAL 2.2.0, released 2017/04/28
##  Path to GDAL shared files: C:/Users/Reeves/Documents/R/win-library/3.3/rgdal/gdal
##  GDAL binary built with GEOS: TRUE 
##  Loaded PROJ.4 runtime: Rel. 4.9.3, 15 August 2016, [PJ_VERSION: 493]
##  Path to PROJ.4 shared files: C:/Users/Reeves/Documents/R/win-library/3.3/rgdal/proj
##  Linking to sp version: 1.2-5
library(RColorBrewer)
library(ggplot2)
library(broom)
library(ggmap)
library(rgeos)
## rgeos version: 0.3-26, (SVN revision 560)
##  GEOS runtime version: 3.6.1-CAPI-1.10.1 r0 
##  Linking to sp version: 1.2-5 
##  Polygon checking: TRUE

Unzip the AreaCode.zip file to extract the AreaCode.shp file delineating the extent of each area code in lat,lon coordinates. Read in the .shp file and then remove all extracted files to clear up space.

# Ensure the zipped area-code shapefile is on disk (download only when it is
# absent), then extract it and read the polygons. The original duplicated
# the unzip()/readOGR() calls in both branches of the if/else.
if (!file.exists("AreaCode.zip")) {
  download.file("https://www.sciencebase.gov/catalog/file/get/4f4e4a19e4b07f02db605716?facet=AreaCode",
                "AreaCode.zip", mode = "wb")
}
unzip("AreaCode.zip")
area <- readOGR("AreaCode.shp")
## OGR data source with driver: ESRI Shapefile 
## Source: "AreaCode.shp", layer: "AreaCode"
## with 336 features
## It has 12 fields
# Delete the extracted shapefile components now that the polygons are in
# memory; file.remove() returns one logical per file.
shp_parts <- paste0("AreaCode.",
                    c("dbf", "prj", "sbn", "sbx", "shx", "shp", "shp.xml"))
file.remove(shp_parts)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE

Select shapes only for the area codes mentioned in the frequency table.

glassine_area<-area[area$NPA %in% ac_tbl$candidate_area_codes,]

Add a column of data indicating the number of mentions per area code.

# Look up each polygon's mention count: match its NPA against the frequency
# table, then index the Freq column by name rather than the fragile
# positional ac_tbl[ac_ref, 2] (which breaks if columns are reordered).
ac_ref <- match(glassine_area@data$NPA, ac_tbl$candidate_area_codes)
glassine_area@data$TALLY <- ac_tbl$Freq[ac_ref]

For area code labels, use the centroid of an area code, which is found using a function in the rgeos package. Find lat,lon of centroids and add them as two columns of data.

# Compute each polygon's centroid once and split the (x, y) coordinate
# matrix into two columns for label placement. Using sp::coordinates()
# avoids both the duplicate gCentroid() call and the tidy() fallback
# warning (broom has no method for SpatialPoints).
centroids <- coordinates(gCentroid(glassine_area, byid = TRUE))
glassine_area$CENTER.x <- centroids[, 1]
glassine_area$CENTER.y <- centroids[, 2]

Make proper transform – not sure if this is necessary

# Reproject the polygons to WGS84 (EPSG:4326) so they align with the
# lat/lon basemap. NOTE(review): the shapefile may already be in WGS84 --
# confirm with proj4string(glassine_area) before removing this step.
glassine_WGS84 <- spTransform(glassine_area, CRS("+init=epsg:4326"))
# Flatten the SpatialPolygonsDataFrame into a plain vertex data frame for
# ggplot; tidy() keeps only coordinates plus a polygon "id" ...
glassine_df_WGS84 <- tidy(glassine_WGS84)
## Regions defined for each Polygons
# ... so recover each polygon's ID from the sp object and merge the
# attribute data (TALLY, NPA, centroid columns) back onto the vertices.
glassine_WGS84$polyID <- sapply(slot(glassine_WGS84, "polygons"), function(x) slot(x, "ID"))
glassine_df_WGS84 <- merge(glassine_df_WGS84, glassine_WGS84, by.x = "id", by.y="polyID")

Choropleth Maps of /r/glassine Dataset

Create a choropleth map of mainland USA

# Fetch a satellite basemap centred on the USA.
# NOTE(review): Google's static-maps endpoint now requires an API key
# (ggmap::register_google()); this call may fail without one -- confirm.
usa_basemap <- get_map(location="United States", zoom=4, maptype = 'satellite')
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=United+States&zoom=4&size=640x640&scale=2&maptype=satellite&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=United%20States&sensor=false
# Overlay the area-code polygons, shaded by mention count (TALLY).
ggmap(usa_basemap) +
        geom_polygon(data = glassine_df_WGS84, 
                     aes(x=long, y=lat, group = group, # coordinates, and group them by polygons
                         fill = TALLY), alpha = 0.5) + # variable to use for filling
        scale_fill_gradient(low="#bfefff",high="red")+
        ggtitle("Area Code Mentions in /r/glassine")

Create a choropleth map of the PA-NJ region

# Fetch a satellite basemap zoomed into Pennsylvania / New Jersey.
# NOTE(review): as with the USA map, this Google endpoint may now require
# an API key -- confirm before re-knitting.
pa_basemap <- get_map(location="PA", zoom=6, maptype = 'satellite')
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=PA&zoom=6&size=640x640&scale=2&maptype=satellite&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=PA&sensor=false
# Overlay the shaded polygons and label area codes with > 20 mentions at
# their centroids; the "Removed ... rows" warning below comes from geom_text
# evaluating every polygon vertex row, most of which get an empty label.
ggmap(pa_basemap) +
        geom_polygon(data = glassine_df_WGS84, 
                    aes(x=long, y=lat, group = group, # coordinates, and group them by polygons
                        fill = TALLY), alpha = .8) + # variable to use for filling
        scale_fill_gradient(low="#bfefff",high="red")+
        geom_text(data = glassine_df_WGS84,aes(x=CENTER.x,y=CENTER.y,
                        label=ifelse(TALLY>20,as.character(NPA),'')))+
        ggtitle("Area Code Mentions in /r/glassine")
## Warning: Removed 12487 rows containing missing values (geom_text).